This example uses the tidyverse suite of packages.
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.4.1
## ✔ readr 2.1.2 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(visdat)
library(ggbeeswarm)
library(psych)
##
## Attaching package: 'psych'
##
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
Please download the final project data from Canvas. If this Rmarkdown file is located in the same directory as the downloaded CSV file, it will be able to load in the data for you. It is highly recommended that you use an RStudio RProject to more easily manage the working directory and file paths of the code and objects associated with the final project.
The code chunk below reads in the final project data.
df <- readr::read_csv("fall2022_finalproject.csv", col_names = TRUE)
## Rows: 1252 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): m
## dbl (10): x1, x2, x3, x4, v1, v2, v3, v4, v5, output
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
The readr::read_csv() function displays the data types
and column names associated with the data. However, a glimpse is shown
below that reveals the number of rows and also shows some of the
representative values for the columns.
df %>% glimpse()
## Rows: 1,252
## Columns: 11
## $ x1 <dbl> 0.025878, 0.030768, 0.019325, 0.306212, 0.031296, 0.031073, 0.0…
## $ x2 <dbl> 0.255934, 0.261575, 0.020877, 0.033379, 0.259342, 0.027119, 0.0…
## $ x3 <dbl> 0.492830, 0.498460, 0.258360, 0.255385, 0.264387, 0.260915, 0.0…
## $ x4 <dbl> 0.012770, 0.055779, 0.012424, 0.056190, 0.056594, 0.055192, 0.0…
## $ v1 <dbl> 0.275651, 0.343204, 4.998508, 5.090153, 5.031107, 9.977407, 0.2…
## $ v2 <dbl> 0.033657, 0.027082, 0.030259, 0.052342, 0.517705, 0.532436, 1.0…
## $ v3 <dbl> 1.166214, 1.260579, 1.298285, 1.322005, 1.368195, 1.298797, 1.1…
## $ v4 <dbl> 0.408402, 0.664248, 0.412870, 0.652111, 0.533701, 0.857509, 0.6…
## $ v5 <dbl> 0.525226, 2.866343, 0.409007, 0.861594, 6.451933, 0.958574, 0.2…
## $ m <chr> "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A"…
## $ output <dbl> 0.786, 0.730, 0.996, 0.326, 0.735, 0.954, 0.969, 0.986, 0.874, …
output variable.df_reg<- df%>%
mutate(y = boot::logit(output)) %>%
select(x1:x4,v1:v5,m,y)%>%
glimpse()
## Rows: 1,252
## Columns: 11
## $ x1 <dbl> 0.025878, 0.030768, 0.019325, 0.306212, 0.031296, 0.031073, 0.02440…
## $ x2 <dbl> 0.255934, 0.261575, 0.020877, 0.033379, 0.259342, 0.027119, 0.03183…
## $ x3 <dbl> 0.492830, 0.498460, 0.258360, 0.255385, 0.264387, 0.260915, 0.02205…
## $ x4 <dbl> 0.012770, 0.055779, 0.012424, 0.056190, 0.056594, 0.055192, 0.05575…
## $ v1 <dbl> 0.275651, 0.343204, 4.998508, 5.090153, 5.031107, 9.977407, 0.23012…
## $ v2 <dbl> 0.033657, 0.027082, 0.030259, 0.052342, 0.517705, 0.532436, 1.00521…
## $ v3 <dbl> 1.166214, 1.260579, 1.298285, 1.322005, 1.368195, 1.298797, 1.16544…
## $ v4 <dbl> 0.408402, 0.664248, 0.412870, 0.652111, 0.533701, 0.857509, 0.69071…
## $ v5 <dbl> 0.525226, 2.866343, 0.409007, 0.861594, 6.451933, 0.958574, 0.20876…
## $ m <chr> "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A…
## $ y <dbl> 1.3009808, 0.9946226, 5.5174529, -0.7263327, 1.0201407, 3.0320223, …
df_classify<-df %>%
mutate(outcome = ifelse(output < 0.33, 'event', 'non_event'),
outcome = factor(outcome, levels = c("event", "non_event"))) %>%
select(x1:x4,v1:v5,m,outcome)%>%
glimpse()
## Rows: 1,252
## Columns: 11
## $ x1 <dbl> 0.025878, 0.030768, 0.019325, 0.306212, 0.031296, 0.031073, 0.…
## $ x2 <dbl> 0.255934, 0.261575, 0.020877, 0.033379, 0.259342, 0.027119, 0.…
## $ x3 <dbl> 0.492830, 0.498460, 0.258360, 0.255385, 0.264387, 0.260915, 0.…
## $ x4 <dbl> 0.012770, 0.055779, 0.012424, 0.056190, 0.056594, 0.055192, 0.…
## $ v1 <dbl> 0.275651, 0.343204, 4.998508, 5.090153, 5.031107, 9.977407, 0.…
## $ v2 <dbl> 0.033657, 0.027082, 0.030259, 0.052342, 0.517705, 0.532436, 1.…
## $ v3 <dbl> 1.166214, 1.260579, 1.298285, 1.322005, 1.368195, 1.298797, 1.…
## $ v4 <dbl> 0.408402, 0.664248, 0.412870, 0.652111, 0.533701, 0.857509, 0.…
## $ v5 <dbl> 0.525226, 2.866343, 0.409007, 0.861594, 6.451933, 0.958574, 0.…
## $ m <chr> "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A…
## $ outcome <fct> non_event, non_event, non_event, event, non_event, non_event, …
df_derived_reg<-df %>%
mutate(x5 = 1 - (x1 + x2 + x3 + x4),
w = x2 / (x3 + x4),
z = (x1 + x2) / (x5 + x4),
t = v1 * v2,
y = boot::logit(output)) %>%
select(m,x5,t,w,z,y)%>%
glimpse()
## Rows: 1,252
## Columns: 6
## $ m <chr> "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A…
## $ x5 <dbl> 0.212588, 0.153418, 0.689014, 0.348834, 0.388381, 0.625701, 0.86595…
## $ t <dbl> 0.009277586, 0.009294651, 0.151249854, 0.266428788, 2.604629249, 5.…
## $ w <dbl> 0.50619858, 0.47195344, 0.07709835, 0.10712990, 0.80796683, 0.08579…
## $ z <dbl> 1.25050808, 1.39745312, 0.05731369, 0.83844661, 0.65315580, 0.08546…
## $ y <dbl> 1.3009808, 0.9946226, 5.5174529, -0.7263327, 1.0201407, 3.0320223, …
df_derived_classify<-df %>%
mutate(x5 = 1 - (x1 + x2 + x3 + x4),
w = x2 / (x3 + x4),
z = (x1 + x2) / (x5 + x4),
t = v1 * v2,
outcome = ifelse(output < 0.33, 'event', 'non_event'),
outcome = factor(outcome, levels = c("event", "non_event"))) %>%
select(x5,w,z,t,m,outcome)%>%
glimpse()
## Rows: 1,252
## Columns: 6
## $ x5 <dbl> 0.212588, 0.153418, 0.689014, 0.348834, 0.388381, 0.625701, 0.…
## $ w <dbl> 0.50619858, 0.47195344, 0.07709835, 0.10712990, 0.80796683, 0.…
## $ z <dbl> 1.25050808, 1.39745312, 0.05731369, 0.83844661, 0.65315580, 0.…
## $ t <dbl> 0.009277586, 0.009294651, 0.151249854, 0.266428788, 2.60462924…
## $ m <chr> "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A…
## $ outcome <fct> non_event, non_event, non_event, event, non_event, non_event, …
names(df)
## [1] "x1" "x2" "x3" "x4" "v1" "v2" "v3" "v4"
## [9] "v5" "m" "output"
df[1:5,]
## # A tibble: 5 × 11
## x1 x2 x3 x4 v1 v2 v3 v4 v5 m output
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <dbl>
## 1 0.0259 0.256 0.493 0.0128 0.276 0.0337 1.17 0.408 0.525 A 0.786
## 2 0.0308 0.262 0.498 0.0558 0.343 0.0271 1.26 0.664 2.87 A 0.73
## 3 0.0193 0.0209 0.258 0.0124 5.00 0.0303 1.30 0.413 0.409 A 0.996
## 4 0.306 0.0334 0.255 0.0562 5.09 0.0523 1.32 0.652 0.862 A 0.326
## 5 0.0313 0.259 0.264 0.0566 5.03 0.518 1.37 0.534 6.45 A 0.735
The data have continuous inputs and a categorical input. The
continuous inputs consist of two groups of variables, the “x-variables”,
x1 through x4, and the “v-variables”,
v1 through v5. The categorical input is
m. The response is continuous and is named
output.
visdat::vis_miss(df)
## Warning: `gather_()` was deprecated in tidyr 1.2.0.
## ℹ Please use `gather()` instead.
## ℹ The deprecated feature was likely used in the visdat package.
## Please report the issue at <]8;;https://github.com/ropensci/visdat/issueshttps://github.com/ropensci/visdat/issues]8;;>.
No Missing values present in the Data set
visdat::vis_dat(df)
There is only one Categorical Column and rest are Continuous.
df %>% purrr::map_dbl(n_distinct)
## x1 x2 x3 x4 v1 v2 v3 v4 v5 m output
## 1245 1250 1250 1235 1252 1249 1252 1252 1252 5 690
One of the variables have relatively few unique values! Checking the counts associated with each unique value for m.
df %>% count(m)
## # A tibble: 5 × 2
## m n
## <chr> <int>
## 1 A 227
## 2 B 252
## 3 C 265
## 4 D 260
## 5 E 248
ggplot(df, aes(x=as.factor(m))) +
geom_bar(color="black", fill="gold")+
scale_y_continuous(expand=c(0.1,0.1)) +
geom_text(stat='count',aes(label = ..count..), vjust = -1)
## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
M values are uniform.
df %>%
select(-output) %>%
distinct() %>%
dim()
## [1] 1252 10
No Duplicate Rows as there are 1252 rows of Data in the Data set
ggplot(df_classify, aes(x=as.factor(outcome))) +
geom_bar(color="black", fill="blue")+
scale_y_continuous(expand=c(0.1,0.1)) +
geom_text(stat='count',aes(label = ..count..), vjust = -1)
df_classify%>%
ggplot(mapping = aes(x = m, fill = outcome, label = ..count..))+
geom_bar(width = 0.5)+
stat_count(geom = "text", colour = "black", size = 3.5, ,position=position_stack(vjust=0.5))+
theme_bw()+
scale_fill_brewer(palette="Accent")
df_classify %>%
count(m, outcome) %>%
ggplot(mapping = aes(x = as.factor(m), y = as.factor(outcome))) +
geom_tile(mapping = aes(fill = n), color = 'black') +
geom_text(mapping = aes(label = n,
color = n > median(n)),
size = 7) +
scale_fill_viridis_c(guide = 'none', option = 'magma') +
scale_color_manual(guide = 'none',
values = c("TRUE" = 'black',
"FALSE" = 'white')) +
theme_bw()
Non event is more compared to event for the input value m with respect to Outcome.
Correlation between continuous output w.r.t continuous inputs
df_reg %>% select(x1,x2,x3,x4,v1,v2,v3,v4,v5 ,y) %>% cor() %>% corrplot::corrplot(type = 'upper')
input_names <- c("x1", "x2", "x3", "x4", "v1","v2","v3","v5")
df_reg %>%
select(all_of(input_names)) %>%
tibble::rowid_to_column() %>%
pivot_longer(!c("rowid")) %>%
ggplot(mapping = aes(x = value)) +
geom_histogram(bins = 35) +
facet_wrap(~name, scales = "free") +
theme_bw() +
theme(axis.text.y = element_blank())
df_reg %>%
select(-v1,-v2,-v3,-v4,-v5) %>%
GGally::ggpairs(progress = FALSE,
diag = list(continuous = GGally::wrap('barDiag', bins=25))) +
theme_bw()
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
df_reg %>%
select(-x1,-x2,-x3,x4) %>%
GGally::ggpairs(progress = FALSE,
diag = list(continuous = GGally::wrap('barDiag', bins=25))) +
theme_bw()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
df_reg %>%
select(x1,x2,x3,x4,v1,v2,v3,v4,v5) %>%
cor() %>%
corrplot::corrplot(method = 'number', type = 'upper',order="hclust")
df_derived_reg %>%
select(x5,t,w,z) %>%
cor() %>%
corrplot::corrplot(method = 'number', type = 'upper',order="hclust")
#create pairs plot
pairs.panels(df_derived_reg)
pairs.panels(df_derived_classify)
df_reg %>%
pivot_longer(starts_with("x")) %>%
ggplot(aes(x=value)) +
geom_density(aes(color=m), bins=50) +
facet_wrap(~name, scales="free", ncol=8) +
ylab("") + xlab("")
## Warning in geom_density(aes(color = m), bins = 50): Ignoring unknown parameters:
## `bins`
All x values have a smiliar trend with respect to m.
df_reg %>%
pivot_longer(starts_with("v")) %>%
ggplot(aes(x=value)) +
geom_density(aes(color=m), bins=50) +
facet_wrap(~name, scales="free", ncol=8) +
ylab("") + xlab("")
## Warning in geom_density(aes(color = m), bins = 50): Ignoring unknown parameters:
## `bins`
All v values have a smiliar trend with respect to m.
df%>% select(output) %>%
ggplot() +
geom_histogram(aes(x=output), bins=50)
After logit Transformation of y
df_reg%>% select(y) %>%
ggplot() +
geom_histogram(aes(x=y), bins=50)
ggplot(df, aes(x = x1, y = output)) +
geom_point() +
geom_smooth()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
ggplot(df_reg, aes(x = x1, y = y)) +
geom_point() +
geom_smooth()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
ggplot(df_reg, aes(x = x2, y = y)) +
geom_point() +
geom_smooth()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
ggplot(df_reg, aes(x = x3, y = y)) +
geom_point() +
geom_smooth()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
ggplot(df_reg, aes(x = x4, y = y)) +
geom_point() +
geom_smooth()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
ggplot(df_reg, aes(x = v1, y = y)) +
geom_point() +
geom_smooth()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
ggplot(df_reg, aes(x = v2, y = y)) +
geom_point() +
geom_smooth()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
ggplot(df_reg, aes(x = v3, y = y)) +
geom_point() +
geom_smooth()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
ggplot(df_reg, aes(x = v4, y = y)) +
geom_point() +
geom_smooth()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
ggplot(df_reg, aes(x = v5, y = y)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, colour = "red")
## `geom_smooth()` using formula = 'y ~ x'
ggplot(df_reg, aes(x = x2, y = y, colour = m)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE) +
geom_smooth(
aes(x = x2, y = y),
method = "lm", se = FALSE, inherit.aes = FALSE,
colour = "black", size = 1, linetype = "dashed"
)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'